import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import svm
from ipywidgets import interact
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.neural_network import MLPClassifier
import plotly.express as px
import seaborn as sn
from pandas.plotting import scatter_matrix
# Display settings: show up to 10k rows, 100 columns, and wide cell contents.
for option, value in [
    ("display.max_rows", 10000),
    ("display.max_columns", 100),
    ("display.max_colwidth", 1000),
]:
    pd.set_option(option, value)

# Load the international match history and take a first look at it.
df = pd.read_excel("international_matches.xlsx")
df.tail(5)
df.info()
Columns like home_team_continent, away_team_continent, neutral_location, country, city, tournament, and shoot_out are not pertinent to our objective, so we're removing them.
# Inspect the available columns, then drop the ones irrelevant to the analysis.
list(df.columns)
irrelevant_columns = [
    'home_team_continent', 'away_team_continent', 'neutral_location',
    'country', 'city', 'tournament', 'shoot_out',
]
clean_data = df.drop(irrelevant_columns, axis='columns')
We do not need match data for teams that are not playing in the FIFA 2022 World Cup, so we'll drop the records for those irrelevant matches. We also won't be including matches that resulted in a draw.
# The 32 teams that qualified for the FIFA 2022 World Cup.
playing_teams = [
    "Qatar", "Ecuador", "Senegal", "Netherlands",
    "England", "Iran", "USA", "Wales",
    "Argentina", "Saudi Arabia", "Mexico", "Poland",
    "France", "Australia", "Denmark", "Tunisia",
    "Spain", "Costa Rica", "Germany", "Japan",
    "Belgium", "Canada", "Morocco", "Croatia",
    "Brazil", "Serbia", "Switzerland", "Cameroon",
    "Portugal", "Ghana", "Uruguay", "Korea Republic",
]
# Keep only matches involving at least one World Cup team, and discard draws
# (the models predict a binary win/lose outcome).
involves_wc_team = (clean_data.home_team.isin(playing_teams)
                    | clean_data.away_team.isin(playing_teams))
not_a_draw = clean_data.home_team_result != "Draw"
clean_data = clean_data[involves_wc_team & not_a_draw].reset_index(drop=True)
clean_data.columns
The column names home_team and away_team will be renamed to team_A and team_B, since the home and away prefixes are irrelevant with respect to the World Cup.
clean_data = clean_data.rename(columns={col: col.replace("home_team", "team_A").replace("away_team", "team_B") for col in clean_data.columns})
We'll add a new column result, which is set to 1 when team_A wins and set to 0 when team_B wins.
# Binary target: 1 when team_A won, 0 otherwise (draws were already removed,
# so 0 always means a team_B win).
clean_data['result'] = (clean_data.team_A_result == "Win").astype(int)
clean_data.drop(['team_A_result'], axis='columns', inplace=True)
clean_data.columns
The following cell creates country wise statistics.
# Build per-country statistics by stacking each match twice: once from
# team_A's perspective and once from team_B's.
b_columns = [col for col in clean_data.columns if "team_B" in col]
country_data_A = clean_data.drop(b_columns, axis='columns')
country_data_A = country_data_A.rename(columns={"team_A": "country"})
country_data_A.columns = [col.replace("team_A_", "") for col in country_data_A.columns]

a_columns = [col for col in clean_data.columns if "team_A" in col]
country_data_B = clean_data.drop(a_columns, axis='columns')
country_data_B = country_data_B.rename(columns={"team_B": "country"})
country_data_B.columns = [col.replace("team_B_", "") for col in country_data_B.columns]
# `result` is from team_A's perspective, so flip it for the team_B rows.
country_data_B['result'] = 1 - country_data_B['result']

country_data = pd.concat([country_data_A, country_data_B])
country_data.tail(10)
# def get_running_avgs(row):
# prev_match_a = country_data[(country_data.date < row['date']) & (country_data.country == row['team_A'])].sort_values(by = 'date').head(5)
# prev_match_b = country_data[(country_data.date < row['date']) & (country_data.country == row['team_B'])].sort_values(by = 'date').head(5)
# if prev_match_a.shape[0] == 0:
# row['avg_score_A'] = row['team_A_score']
# row['avg_gk_score_A'] = row['team_A_goalkeeper_score']
# row['avg_defense_A'] = row['team_A_mean_defense_score']
# row['avg_mid_A'] = row['team_A_mean_midfield_score']
# row['avg_offense_A'] = row['team_A_mean_offense_score']
# else:
# row['avg_score_A'] = prev_match_a['score'].mean()
# row['avg_gk_score_A'] = prev_match_a['goalkeeper_score'].mean()
# row['avg_defense_A'] = prev_match_a['mean_defense_score'].mean()
# row['avg_mid_A'] = prev_match_a['mean_midfield_score'].mean()
# row['avg_offense_A'] = prev_match_a['mean_offense_score'].mean()
# if prev_match_b.shape[0] == 0:
# row['avg_score_B'] = row['team_B_score']
# row['avg_gk_score_B'] = row['team_B_goalkeeper_score']
# row['avg_defense_B'] = row['team_B_mean_defense_score']
# row['avg_mid_B'] = row['team_B_mean_midfield_score']
# row['avg_offense_B'] = row['team_B_mean_offense_score']
# else:
# row['avg_score_B'] = prev_match_b['score'].mean()
# row['avg_gk_score_B'] = prev_match_b['goalkeeper_score'].mean()
# row['avg_defense_B'] = prev_match_b['mean_defense_score'].mean()
# row['avg_mid_B'] = prev_match_b['mean_midfield_score'].mean()
# row['avg_offense_B'] = prev_match_b['mean_offense_score'].mean()
# return row
# Keep a copy so later feature engineering doesn't disturb clean_data.
data = clean_data.copy(deep=True)

# Pairwise scatter plots of the country-level numeric features.
column_list = [
    'fifa_rank', 'total_fifa_points', 'score', 'goalkeeper_score',
    'mean_defense_score', 'mean_offense_score', 'mean_midfield_score',
]
short_labels = {col: col.replace('_score', '') for col in column_list}
fig = px.scatter_matrix(country_data[column_list], labels=short_labels)
fig.update_layout({"title": "Scatter Plot Matrix Country Wise Data", 'height': 800, 'width': 1000, 'font_size': 8.6})
fig.update_layout({f"xaxis{axis}": dict(tickangle=-45) for axis in range(1, 8)})
fig.update_traces(diagonal_visible=False, marker={'size': 3})
fig.show()
We can see that the columns fifa_rank and total_fifa_points are negatively correlated, which is as expected since teams with a good (numerically low) rank are expected to have more points. goalkeeper_score is positively correlated with total_fifa_points, mean_defense_score, and mean_offense_score. The following cell shows the correlation matrix for the same data.
# Correlation heatmap for the same feature set, annotated with coefficients.
sn.heatmap(country_data[column_list].corr(),annot=True)
plt.show()
We can see that mean_defense_score, mean_offense_score, and goalkeeper_score have high correlation among them. This can be a cause of multicollinearity, so we will only include mean_midfield_score and goalkeeper_score in the model.
# Yearly, per-country aggregates: match count, wins, and average scores.
ctry_data = country_data.copy(deep=True)
ctry_data['year'] = ctry_data['date'].dt.to_period('Y')
grp_data = ctry_data.groupby(['country', 'year'], as_index=False).agg(
    {'score': 'mean', 'result': ['count', 'sum'], 'goalkeeper_score': 'mean',
     'mean_defense_score': 'mean', 'mean_offense_score': 'mean',
     'mean_midfield_score': 'mean'})
# Average stats for the four 2022 semi-finalists.
# numeric_only=True: older pandas silently dropped non-numeric columns
# (date, year) in mean(); pandas >= 2.0 raises a TypeError without it.
wc_semi = ctry_data[ctry_data.country.isin(["Morocco", "France", "Argentina", "Croatia"])]
wc_semi = wc_semi.groupby("country", as_index=False).mean(numeric_only=True)
# Bar chart: average goalkeeper score for the four semi-finalists.
fig = px.bar(
    wc_semi,
    x="country",
    y="goalkeeper_score",
    color="country",
    text_auto=True,
    labels={"country": "Country", "goalkeeper_score": "Goalkeeper Score"},
)
fig.update_layout({"title": "GoalKeeper Score vs Country (2022 Season) Top 4"})
fig.show()
# Bar chart: average midfield score for the four semi-finalists.
fig = px.bar(
    wc_semi,
    x="country",
    y="mean_midfield_score",
    color="country",
    text_auto=True,
    labels={"country": "Country", "mean_midfield_score": "Mean Midfield Score"},
    title="Mean Midfield Score vs Country (2022 Season) Top 4",
)
fig.show()
Looking at the difference between the goals scored by the winner and the loser, the following chart shows the goal difference for the top 4 teams.
import warnings
warnings.filterwarnings("ignore")
# Goal difference (own goals minus opponent's) for the four semi-finalists,
# built from both the team_A and the team_B perspectives.
wc_data = data[data.team_A.isin(["Morocco", "France", "Argentina", "Croatia"])]
wc_data['goal_diff'] = wc_data['team_A_score'] - wc_data['team_B_score']
wc_data.rename(columns = {'team_A' : 'country'}, inplace=True)
wc_data2 = data[data.team_B.isin(["Morocco", "France", "Argentina", "Croatia"])]
# BUG FIX: goal_diff was computed from wc_data (the team_A subset), which
# aligns on a different index and left NaN for every row unique to wc_data2.
# Compute it from wc_data2 itself.
wc_data2['goal_diff'] = wc_data2['team_B_score'] - wc_data2['team_A_score']
wc_data2.rename(columns = {'team_B': 'country'}, inplace=True)
wc_data = pd.concat([wc_data, wc_data2])[['country','goal_diff','date']]
# Average goal difference per country since 2015, best first.
wc_data_grp = wc_data[wc_data.date >= "2015-01-01"].groupby('country', as_index=False).mean().sort_values(by='goal_diff', ascending=False)
fig = px.bar(wc_data_grp, x='country', y='goal_diff', color='country', title="Goal Difference vs Country",
             labels={'country': 'Country', 'goal_diff': 'Goal Difference(Top 4)'})
fig.show()
fifa_rank is an important variable. Given two teams the difference between their fifa_ranks should be a good metric. Difference between fifa_points is also a good metric.
# Relative-strength features: signed differences in FIFA rank and FIFA points
# between team_A and team_B (negative rank_diff means team_A is ranked better).
clean_data['rank_diff'] = clean_data['team_A_fifa_rank'] - clean_data['team_B_fifa_rank']
clean_data['point_diff'] = clean_data['team_A_total_fifa_points'] - clean_data['team_B_total_fifa_points']
The following cell calculates the win_rate for each rank_diff
# Empirical win rate of team_A for every observed rank difference.
wr_rd = clean_data.groupby('rank_diff', as_index=False).agg({'result': ['sum', 'count']})
# Flatten the MultiIndex columns: ('result', 'sum') -> 'resultsum', etc.
wr_rd.columns = [''.join(col) for col in wr_rd.columns]
wr_rd['win_rate'] = wr_rd['resultsum'] / wr_rd['resultcount']
fig = px.scatter(
    x=wr_rd['rank_diff'],
    y=wr_rd['win_rate'],
    title="Win Rate vs Rank Diff",
    labels={'x': 'Rank Difference', 'y': 'Win Rate'},
    height=500,
    width=500,
)
fig.update_layout()
fig.show()
We can see that when rank_diff > 0, meaning opposition is a better team, the bigger the difference the smaller the win_rate.
def plot_cm(matrix, title):
    """Draw a row-normalized confusion-matrix heatmap with the given title."""
    row_totals = [sum(row) for row in matrix]
    normalized = [
        [cell / total for cell in row]
        for row, total in zip(matrix, row_totals)
    ]
    plt.figure(figsize=(5, 5))
    sn.heatmap(normalized, annot=True)
    plt.title(title)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
# Feature matrix: team identities plus the strength features selected after
# the correlation analysis (rank/point differences, midfield and goalkeeper
# scores); target is the binary team_A-wins flag.
X = clean_data[[ 'team_A', 'team_B', 'rank_diff', 'point_diff', 'team_A_mean_midfield_score', 'team_B_mean_midfield_score','team_A_goalkeeper_score','team_B_goalkeeper_score']]
Y = clean_data['result']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Columns that need one-hot encoding in the sklearn pipelines below.
categorical_features = ['team_A', 'team_B']
def logistic_classification_model(X_train, X_test, y_train, y_test):
    """Fit a one-hot-encoding + logistic-regression pipeline and evaluate it.

    Returns (fitted_pipeline, test_predictions, test_accuracy,
    confusion_matrix). Uses the module-level `categorical_features` list.
    """
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    # BUG FIX: ColumnTransformer defaults to remainder="drop", which silently
    # discarded every numeric feature (rank_diff, point_diff, the score
    # columns), leaving the model only the team identities. Pass the numeric
    # columns through so the model actually uses them.
    preprocessor = ColumnTransformer(
        transformers=[("cat", categorical_transformer, categorical_features)],
        remainder="passthrough",
    )
    clf = Pipeline(
        steps=[("preprocessor", preprocessor),
               ("classifier", LogisticRegression(class_weight='balanced'))]
    )
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    cm = confusion_matrix(y_test, pred)
    return clf, pred, score, cm
# Train and evaluate the logistic model, then visualize its confusion matrix.
logistic_clf,predictions,logistic_acc,logistic_cm = logistic_classification_model(X_train, X_test, y_train, y_test)
pred = logistic_clf.predict(X_test)
logistic_acc
logistic_cm
plot_cm(logistic_cm, "Logistic Model")
def random_forest_model(X_train, X_test, y_train, y_test):
    """Fit a one-hot-encoding + random-forest pipeline and evaluate it.

    Returns (fitted_pipeline, test_predictions, test_accuracy,
    confusion_matrix). Uses the module-level `categorical_features` list.
    """
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    # BUG FIX: ColumnTransformer defaults to remainder="drop", which silently
    # discarded every numeric feature — forward them with "passthrough" so
    # the forest trains on more than the team identities.
    preprocessor = ColumnTransformer(
        transformers=[("cat", categorical_transformer, categorical_features)],
        remainder="passthrough",
    )
    rand_forest_clf = Pipeline(
        steps=[("preprocessor", preprocessor),
               ("classifier", RandomForestClassifier(class_weight='balanced'))]
    )
    rand_forest_clf.fit(X_train, y_train)
    score = rand_forest_clf.score(X_test, y_test)
    pred = rand_forest_clf.predict(X_test)
    cm = confusion_matrix(y_test, pred)
    return rand_forest_clf, pred, score, cm
# Train and evaluate the random forest, then visualize its confusion matrix.
# NOTE(review): this rebinds the name `random_forest_model` from the function
# to the fitted pipeline, so the function cannot be called again afterwards;
# later code relies on the name holding the pipeline.
random_forest_model, preds, random_forest_score, random_forest_confusion_mat = random_forest_model(X_train, X_test, y_train, y_test)
random_forest_score
random_forest_confusion_mat
plot_cm(random_forest_confusion_mat, "Random Forest")
# TensorFlow / Keras for the neural-network model.
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras.utils import to_categorical
# Persist the best weights (by validation accuracy) seen during training.
checkpoint_filepath = './checkpoints'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_filepath,
save_weights_only=True,
monitor='val_accuracy',
mode='max',
save_best_only=True)
# Stop training once validation accuracy plateaus for 20 epochs and restore
# the best weights observed.
early_stopping = tf.keras.callbacks.EarlyStopping(
monitor='val_accuracy',
verbose=1,
patience=20,
mode='max',
restore_best_weights=True)
# Dataset for the neural network: label-encode team names, drop missing rows.
data = clean_data[[ 'team_A', 'team_B', 'rank_diff', 'point_diff', 'team_A_mean_midfield_score', 'team_B_mean_midfield_score','team_A_goalkeeper_score','team_B_goalkeeper_score','result']].copy(deep=True)
le_a = LabelEncoder()
le_b = LabelEncoder()
le_a.fit(data['team_A'])
le_b.fit(data['team_B'])
data['team_B'] = le_b.transform(data['team_B'])
data['team_A'] = le_a.transform(data['team_A'])
data.dropna(inplace=True)
# Class weights to counter the win/lose imbalance during training.
neg, pos = np.bincount(data['result'])
total = neg + pos
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}
# BUG FIX: the validation set was previously split from the full dataset a
# second time, so validation (and the re-split training set) overlapped the
# test set — data leakage. Carve the validation set out of train_df instead.
train_df, test_df = train_test_split(data, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)
train_labels = np.array(train_df.pop("result"))
test_labels = np.array(test_df.pop("result"))
val_labels = np.array(val_df.pop("result"))
train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)
# Feed-forward binary classifier: stacked ReLU layers with light dropout,
# ending in a sigmoid win-probability output.
model = tf.keras.Sequential([
    layers.Dense(512, input_shape=(train_features.shape[-1],), activation=tf.nn.relu),
    layers.Dense(512, activation=tf.nn.relu),
    layers.Dropout(0.01),
    layers.Dense(256, activation=tf.nn.relu),
    layers.Dropout(0.01),
    layers.Dense(256, activation=tf.nn.relu),
    layers.Dropout(0.01),
    layers.Dense(128, activation=tf.nn.relu),
    layers.Dropout(0.01),
    layers.Dense(1, activation=tf.nn.sigmoid),
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.binary_crossentropy,
    metrics='accuracy',
)
# Train with class weights, checkpointing, and early stopping on val accuracy.
history = model.fit(
train_features,
train_labels,
epochs=50,
batch_size=50,
class_weight=class_weight,
callbacks=[model_checkpoint_callback, early_stopping],
validation_data=(val_features, val_labels)
)
# Training curves: accuracy and loss per epoch.
for metric, label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.epoch, history.history[metric])
    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.title(f"{label} Curve")
    plt.show()
# Win probabilities for the held-out test set.
test_predictions_baseline = model.predict(test_features, batch_size=10)
# Test-set loss/accuracy: once quietly (stored), once verbosely for display.
baseline_results = model.evaluate(test_features, test_labels,
batch_size=10, verbose=0)
model.evaluate(test_features, test_labels,
batch_size=10)
def plot_cm_tf(labels, predictions):
    """Plot a row-normalized confusion matrix for the neural network."""
    raw = confusion_matrix(labels, predictions)
    normalized = []
    for row in raw:
        row_sum = sum(row)
        normalized.append([value / row_sum for value in row])
    plt.figure(figsize=(5, 5))
    sn.heatmap(normalized, annot=True)
    plt.title('Neural Network Confusion matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
# Threshold the sigmoid outputs at 0.5 and plot the resulting confusion matrix.
test_preds_roff = np.round(test_predictions_baseline)
plot_cm_tf(test_labels, test_preds_roff)
def predict_outcome(team1, team2):
    """Predict the winner of team1 vs team2 with all three trained models.

    Builds each team's recent-form features from matches after 2019-06-01 in
    `country_data`, shows comparison charts, and returns a DataFrame with the
    predicted winner per model. Relies on the module-level `country_data`,
    `logistic_clf`, `random_forest_model` (the fitted pipeline), `le_a`/`le_b`,
    and the Keras `model`.
    """
    team1_data = country_data[(country_data.country == team1) & (country_data.date > "2019-06-01")]
    team2_data = country_data[(country_data.country == team2) & (country_data.date > "2019-06-01")]
    rank_team1 = team1_data.fifa_rank.median()
    rank_team2 = team2_data.fifa_rank.median()
    gk_score1 = team1_data.goalkeeper_score.mean()
    gk_score2 = team2_data.goalkeeper_score.mean()
    midfield_score1 = team1_data.mean_midfield_score.mean()
    midfield_score2 = team2_data.mean_midfield_score.mean()
    # BUG FIX: win rate is wins / number of matches; shape[0] is the row
    # (match) count — shape[1] was the column count.
    win_rate1 = team1_data.result.sum() / team1_data.shape[0]
    win_rate2 = team2_data.result.sum() / team2_data.shape[0]
    # Break exact rank ties so rank_diff is never zero.
    if rank_team1 == rank_team2:
        rank_team2 = team2_data.fifa_rank.unique()[1]
    fifa_point1 = team1_data.total_fifa_points.mean()
    fifa_point2 = team2_data.total_fifa_points.mean()
    fig1 = px.bar(
        x=[team1, team2],
        y=[fifa_point1, fifa_point2],
        title="Avg FIFA Points (2022 Season)",
        labels= {"x": "Country", "y": "FIFA Points"},
        color=[team1, team2],
    )
    fig1.show()
    fig2 = px.bar(
        x=[team1, team2],
        y=[win_rate1, win_rate2],
        title="Win Rate (2022 Season)",
        labels= {"x": "Country", "y": "Win Rate"},
        color=[team1, team2],
    )
    fig2.show()
    fig3 = px.bar(
        x=[team1, team2],
        y=[gk_score1, gk_score2],
        title="Avg Goalkeeper Score (2022 Season)",
        labels= {"x": "Country", "y": "Avg Goalkeeper Score"},
        color=[team1, team2],
    )
    fig3.show()
    # BUG FIX: this chart plotted the goalkeeper scores again — it is labeled
    # "Avg Midfield Score", so plot the midfield scores.
    fig4 = px.bar(
        x=[team1, team2],
        y=[midfield_score1, midfield_score2],
        title="Avg Midfield Score (2022 Season)",
        labels= {"x": "Country", "y": "Avg Midfield Score"},
        color=[team1, team2],
    )
    fig4.show()
    # One-row feature frame matching the sklearn pipelines' input schema.
    df = pd.DataFrame([{
        "team_A": team1,
        "team_B": team2,
        "rank_diff": rank_team1 - rank_team2,
        "point_diff": fifa_point1 - fifa_point2,
        "team_A_mean_midfield_score": midfield_score1,
        "team_B_mean_midfield_score": midfield_score2,
        "team_A_goalkeeper_score": gk_score1,
        "team_B_goalkeeper_score": gk_score2,
    }])
    #Logistic
    display(df)
    logistic_prediction = team1 if logistic_clf.predict(df)[0] == 1 else team2
    random_forest_prediction = team1 if random_forest_model.predict(df)[0] == 1 else team2
    # The neural network needs label-encoded team ids instead of names.
    df['team_B'] = le_b.transform(df['team_B'])
    df['team_A'] = le_a.transform(df['team_A'])
    mlp_prediction = model.predict(df, verbose=0)[0][0]
    mlp_prediction = team1 if mlp_prediction > 0.5 else team2
    predictions = pd.DataFrame(
        [
            {
                "Model": "Logistic",
                "Predicted Winner": logistic_prediction
            },
            {
                "Model": "Random Forest",
                "Predicted Winner": random_forest_prediction
            },
            {
                "Model": "Multilayer Perceptron (Neural Network)",
                "Predicted Winner": mlp_prediction
            },
        ]
    )
    return predictions
# Dropdown widgets for interactively picking a matchup (sorted country names).
country_list = country_data.country.unique()
country_list.sort()
team1_dropdown = widgets.Dropdown(
options=country_list,
value="France"
)
team2_dropdown = widgets.Dropdown(
options=country_list,
value="Argentina"
)
# Wire the dropdowns to predict_outcome, then run one example matchup.
op = interact(predict_outcome, team1=team1_dropdown, team2=team2_dropdown)
predict_outcome("France", "Argentina")